AnĂ¡lisis de https://www.nature.com/articles/srep00196.pdf

Podemos usar read_lines_chunked si el archivo original es grande:

library(tidyverse)
limpiar <- function(lineas,...){
  str_split(lineas, ',') %>% 
    keep(function(x) x[1] == 'EastAsian') %>%
    map(function(x){
          ing <- x[-1]
          ing[nchar(ing) > 0]
        })
}
filtrado <- read_lines_chunked('../datos/recetas/srep00196-s3.csv',
                    skip = 1, callback = ListCallback$new(limpiar))
recetas <-  filtrado %>% flatten
library(arules)
length(recetas)
[1] 2512
pars <- list(support = 0.05,  target = 'frequent itemsets',
             ext = TRUE)
ap_recetas <- apriori(recetas, parameter = pars)
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen maxlen            target  ext
         NA    0.1    1 none FALSE            TRUE       5    0.05      1     10 frequent itemsets TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 125 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [41 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 done [0.01s].
sorting transactions ... done [0.00s].
writing ... [628 set(s)] done [0.00s].
creating S4 object  ... done [0.00s].
length(ap_recetas)
[1] 628

Vemos los items frecuentes

frecs <- ap_recetas %>% subset(size(.) == 1 ) %>% sort(by = 'support') %>%
 DATAFRAME
DT::datatable(frecs %>% mutate_if(is.numeric, function(x) round(x, 3)))

Y ahora examinamos combinaciones frecuentes de distintos tamaños

ap_recetas %>% 
  subset(size(.) == 3) %>%
  subset(support > 0.20) %>%
  sort(by = 'support') %>%
  inspect
    items                         support   transIdenticalToItemsets count
[1] {cayenne,garlic,scallion}     0.2440287 0.0007961783             613  
[2] {garlic,scallion,soy_sauce}   0.2308917 0.0000000000             580  
[3] {garlic,scallion,sesame_oil}  0.2050159 0.0000000000             515  
[4] {garlic,sesame_oil,soy_sauce} 0.2050159 0.0000000000             515  

Incluso hay algunas combinaciones de 4 ingredientes que ocurren con frecuencia alta: estos ingredientes son bases de salsas, combinaciones de condimentos:

ap_recetas %>% 
  subset(size(.) == 4) %>%
  subset(support > 0.10) %>%
  sort(by = 'support') %>%
  inspect
     items                                            support   transIdenticalToItemsets count
[1]  {garlic,scallion,sesame_oil,soy_sauce}           0.1544586 0                        388  
[2]  {cayenne,garlic,scallion,soy_sauce}              0.1425159 0                        358  
[3]  {cayenne,garlic,ginger,scallion}                 0.1337580 0                        336  
[4]  {cayenne,garlic,scallion,sesame_oil}             0.1297771 0                        326  
[5]  {black_pepper,garlic,scallion,soy_sauce}         0.1234076 0                        310  
[6]  {garlic,ginger,scallion,soy_sauce}               0.1134554 0                        285  
[7]  {cayenne,garlic,sesame_oil,soy_sauce}            0.1078822 0                        271  
[8]  {garlic,roasted_sesame_seed,scallion,sesame_oil} 0.1070860 0                        269  
[9]  {cayenne,garlic,scallion,soybean}                0.1027070 0                        258  
[10] {black_pepper,garlic,sesame_oil,soy_sauce}       0.1019108 0                        256  
[11] {cayenne,garlic,ginger,soy_sauce}                0.1015127 0                        255  
[12] {black_pepper,cayenne,garlic,scallion}           0.1007166 0                        253  
pars <- list(support = 0.005, confidence = 0.10,
             target = 'rules',
             ext = TRUE)
reglas_recetas <- apriori(recetas, parameter = pars)
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen maxlen target  ext
        0.1    0.1    1 none FALSE            TRUE       5   0.005      1     10  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 12 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [117 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 8 9 10
Mining stopped (maxlen reached). Only patterns up to a length of 10 returned!
 done [0.05s].
writing ... [202584 rule(s)] done [0.04s].
creating S4 object  ... done [0.04s].
agregar_hyperlift <- function(reglas, trans){
  quality(reglas) <- cbind(quality(reglas), 
    hyper_lift = interestMeasure(reglas, measure = "hyperLift", 
    transactions = trans))
  reglas
}
reglas_recetas <- agregar_hyperlift(reglas_recetas, recetas)

AnĂ¡lisis de pares comunes

library(arulesViz)
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.2 & support > 0.05 & confidence > 0.40)
length(reglas_1)
[1] 941
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
#inspect(reglas_tam_2 %>% sort(by = 'hyper_lift')) 
plotly_arules(reglas_1 %>% subset(support > 0.2))
library(tidygraph)
library(ggraph)
frecs <- 
df_reglas <- reglas_tam_2 %>% DATAFRAME %>% rename(from=LHS, to=RHS) %>% as_data_frame
df_reglas$weight <- log(df_reglas$lift)
graph_1 <- as_tbl_graph(df_reglas) %>%
  mutate(centrality = centrality_degree(mode = "all")) 
ggraph(graph_1, layout = 'fr') +
  geom_edge_link(aes(alpha=lift), 
                 colour = 'red',
                 arrow = arrow(length = unit(4, 'mm'))) + 
  geom_node_point(aes(size = centrality, colour = centrality)) + 
  geom_node_text(aes(label = name), size=4,
                 colour = 'gray20', repel=TRUE) +
  theme_graph()

reglas_1 <- subset(reglas_recetas, hyper_lift > 1.8 & confidence > 0.1)
length(reglas_1)
[1] 21772
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
length(reglas_tam_2)
[1] 142
library(tidygraph)
library(ggraph)
df_reglas <- reglas_tam_2 %>% DATAFRAME %>% rename(from=LHS, to=RHS) %>% as_data_frame
df_reglas$weight <- log(df_reglas$hyper_lift)
graph_1 <- as_tbl_graph(df_reglas) %>%
  mutate(centrality = centrality_degree(mode = "all")) 
ggraph(graph_1, layout = 'fr', start.temp=100) +
  geom_edge_link(aes(alpha=lift), 
                 colour = 'red',
                 arrow = arrow(length = unit(4, 'mm'))) + 
  geom_node_point(aes(size = centrality, colour = centrality)) + 
  geom_node_text(aes(label = name), size=4,
                 colour = 'gray20', repel=TRUE) +
  theme_graph()

Exportamos para examinar en Gephi:

write_csv(df_reglas %>% rename(source=from, target=to) %>%
            select(-count), 
          path='reglas.csv')
LS0tCnRpdGxlOiAiQW7DoWxpc2lzIGRlIGluZ3JlZGllbnRlcyBlbiByZWNldGFzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpBbsOhbGlzaXMgZGUKaHR0cHM6Ly93d3cubmF0dXJlLmNvbS9hcnRpY2xlcy9zcmVwMDAxOTYucGRmCgpQb2RlbW9zIHVzYXIgKnJlYWRfbGluZXNfY2h1bmtlZCogc2kgZWwgYXJjaGl2byBvcmlnaW5hbCBlcyBncmFuZGU6CgpgYGB7ciwgbWVzc2FnZSA9IEZBTFNFfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGltcGlhciA8LSBmdW5jdGlvbihsaW5lYXMsLi4uKXsKICBzdHJfc3BsaXQobGluZWFzLCAnLCcpICU+JSAKICAgIGtlZXAoZnVuY3Rpb24oeCkgeFsxXSA9PSAnRWFzdEFzaWFuJykgJT4lCiAgICBtYXAoZnVuY3Rpb24oeCl7CiAgICAgICAgICBpbmcgPC0geFstMV0KICAgICAgICAgIGluZ1tuY2hhcihpbmcpID4gMF0KICAgICAgICB9KQp9CmZpbHRyYWRvIDwtIHJlYWRfbGluZXNfY2h1bmtlZCgnLi4vZGF0b3MvcmVjZXRhcy9zcmVwMDAxOTYtczMuY3N2JywKICAgICAgICAgICAgICAgICAgICBza2lwID0gMSwgY2FsbGJhY2sgPSBMaXN0Q2FsbGJhY2skbmV3KGxpbXBpYXIpKQpyZWNldGFzIDwtICBmaWx0cmFkbyAlPiUgZmxhdHRlbgpgYGAKCgpgYGB7ciwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KbGlicmFyeShhcnVsZXMpCmxlbmd0aChyZWNldGFzKQpwYXJzIDwtIGxpc3Qoc3VwcG9ydCA9IDAuMDUsICB0YXJnZXQgPSAnZnJlcXVlbnQgaXRlbXNldHMnLAogICAgICAgICAgICAgZXh0ID0gVFJVRSkKYXBfcmVjZXRhcyA8LSBhcHJpb3JpKHJlY2V0YXMsIHBhcmFtZXRlciA9IHBhcnMpCmxlbmd0aChhcF9yZWNldGFzKQpgYGAKClZlbW9zIGxvcyBpdGVtcyBmcmVjdWVudGVzCgpgYGB7cn0KZnJlY3MgPC0gYXBfcmVjZXRhcyAlPiUgc3Vic2V0KHNpemUoLikgPT0gMSApICU+JSBzb3J0KGJ5ID0gJ3N1cHBvcnQnKSAlPiUKIERBVEFGUkFNRQpEVDo6ZGF0YXRhYmxlKGZyZWNzICU+JSBtdXRhdGVfaWYoaXMubnVtZXJpYywgZnVuY3Rpb24oeCkgcm91bmQoeCwgMykpKQpgYGAKClkgYWhvcmEgZXhhbWluYW1vcyBjb21iaW5hY2lvbmVzIGZyZWN1ZW50ZXMgZGUgZGlzdGludG9zIHRhbWHDsW9zCgpgYGB7cn0KYXBfcmVjZXRhcyAlPiUgCiAgc3Vic2V0KHNpemUoLikgPT0gMykgJT4lCiAgc3Vic2V0KHN1cHBvcnQgPiAwLjIwKSAlPiUKICBzb3J0KGJ5ID0gJ3N1cHBvcnQnKSAlPiUKICBpbnNwZWN0CmBgYAoKSW5jbHVzbyBoYXkgYWxndW5hcyBjb21iaW5hY2lvbmVzIGRlIDQgaW5ncmVkaWVudGVzIHF1ZSBvY3VycmVuIGNvbiBmcmVjdWVuY2lhIGFsdGE6CmVzdG9zIGluZ3JlZGllbnRlcyBzb24gYmFzZXMgZGUgc2Fsc2FzLCBjb21iaW5hY2lvbmVzIGRlIGNvbmRpbWVudG9zOgoKYGBge3J9CmFwX3JlY2V0YXMgJT4lIAogIHN1YnNldChzaXplKC4pID09IDQpICU+JQogIHN1YnNldChzdXBwb3J0ID4gMC4xMCkgJT4lCiAgc29ydChieSA9ICdzdXBwb3J0JykgJT4lCiAgaW5zcGVjdApgYGAKCgpgYGB7cn0KcGFycyA8LSBsaXN0KHN1cHBvcnQgPSAwLjAwNSwgY29uZmlkZW5jZSA9IDAuMTAsCiAgICAgICAgICAgICB0YXJnZXQgPSAncnVsZXMnLAogICAgICAgICAgICAgZXh0ID0gVFJVRSkKcmVnbGFzX3JlY2V0YXMgPC0gYXByaW9yaShyZWNldGFzLCBwYXJhbWV0ZXIgPSBwYXJzKQpgYGAKCmBgYHtyfQphZ3JlZ2FyX2h5cGVybGlmdCA8LSBmdW5jdGlvbihyZWdsYXMsIHRyYW5zKXsKICBxdWFsaXR5KHJlZ2xhcykgPC0gY2JpbmQocXVhbGl0eShyZWdsYXMpLCAKCWh5cGVyX2xpZnQgPSBpbnRlcmVzdE1lYXN1cmUocmVnbGFzLCBtZWFzdXJlID0gImh5cGVyTGlmdCIsIAoJdHJhbnNhY3Rpb25zID0gdHJhbnMpKQogIHJlZ2xhcwp9CnJlZ2xhc19yZWNldGFzIDwtIGFncmVnYXJfaHlwZXJsaWZ0KHJlZ2xhc19yZWNldGFzLCByZWNldGFzKQpgYGAKCgojIyBBbsOhbGlzaXMgZGUgcGFyZXMgY29tdW5lcwoKYGBge3J9CmxpYnJhcnkoYXJ1bGVzVml6KQpyZWdsYXNfMSA8LSBzdWJzZXQocmVnbGFzX3JlY2V0YXMsIGh5cGVyX2xpZnQgPiAxLjIgJiBzdXBwb3J0ID4gMC4wNSAmIGNvbmZpZGVuY2UgPiAwLjQwKQpsZW5ndGgocmVnbGFzXzEpCnJlZ2xhc190YW1fMiA8LSBzdWJzZXQocmVnbGFzXzEsIHNpemUocmVnbGFzXzEpPT0yKQojaW5zcGVjdChyZWdsYXNfdGFtXzIgJT4lIHNvcnQoYnkgPSAnaHlwZXJfbGlmdCcpKSAKcGxvdGx5X2FydWxlcyhyZWdsYXNfMSAlPiUgc3Vic2V0KHN1cHBvcnQgPiAwLjIpKQpgYGAKCmBgYHtyLCBmaWcud2lkdGg9MTAsIGZpZy5oZWlnaHQ9OH0KbGlicmFyeSh0aWR5Z3JhcGgpCmxpYnJhcnkoZ2dyYXBoKQpmcmVjcyA8LSAKZGZfcmVnbGFzIDwtIHJlZ2xhc190YW1fMiAlPiUgREFUQUZSQU1FICU+JSByZW5hbWUoZnJvbT1MSFMsIHRvPVJIUykgJT4lIGFzX2RhdGFfZnJhbWUKZGZfcmVnbGFzJHdlaWdodCA8LSBsb2coZGZfcmVnbGFzJGxpZnQpCmdyYXBoXzEgPC0gYXNfdGJsX2dyYXBoKGRmX3JlZ2xhcykgJT4lCiAgbXV0YXRlKGNlbnRyYWxpdHkgPSBjZW50cmFsaXR5X2RlZ3JlZShtb2RlID0gImFsbCIpKSAKCmdncmFwaChncmFwaF8xLCBsYXlvdXQgPSAnZnInKSArCiAgZ2VvbV9lZGdlX2xpbmsoYWVzKGFscGhhPWxpZnQpLCAKICAgICAgICAgICAgICAgICBjb2xvdXIgPSAncmVkJywKICAgICAgICAgICAgICAgICBhcnJvdyA9IGFycm93KGxlbmd0aCA9IHVuaXQoNCwgJ21tJykpKSArIAogIGdlb21fbm9kZV9wb2ludChhZXMoc2l6ZSA9IGNlbnRyYWxpdHksIGNvbG91ciA9IGNlbnRyYWxpdHkpKSArIAogIGdlb21fbm9kZV90ZXh0KGFlcyhsYWJlbCA9IG5hbWUpLCBzaXplPTQsCiAgICAgICAgICAgICAgICAgY29sb3VyID0gJ2dyYXkyMCcsIHJlcGVsPVRSVUUpICsKICB0aGVtZV9ncmFwaCgpCmBgYAoKCmBgYHtyfQpyZWdsYXNfMSA8LSBzdWJzZXQocmVnbGFzX3JlY2V0YXMsIGh5cGVyX2xpZnQgPiAxLjggJiBjb25maWRlbmNlID4gMC4xKQpsZW5ndGgocmVnbGFzXzEpCnJlZ2xhc190YW1fMiA8LSBzdWJzZXQocmVnbGFzXzEsIHNpemUocmVnbGFzXzEpPT0yKQpsZW5ndGgocmVnbGFzX3RhbV8yKQpgYGAKCmBgYHtyLCBmaWcud2lkdGg9MTAsIGZpZy5oZWlnaHQ9OH0KbGlicmFyeSh0aWR5Z3JhcGgpCmxpYnJhcnkoZ2dyYXBoKQpkZl9yZWdsYXMgPC0gcmVnbGFzX3RhbV8yICU+JSBEQVRBRlJBTUUgJT4lIHJlbmFtZShmcm9tPUxIUywgdG89UkhTKSAlPiUgYXNfZGF0YV9mcmFtZQpkZl9yZWdsYXMkd2VpZ2h0IDwtIGxvZyhkZl9yZWdsYXMkaHlwZXJfbGlmdCkKZ3JhcGhfMSA8LSBhc190YmxfZ3JhcGgoZGZfcmVnbGFzKSAlPiUKICBtdXRhdGUoY2VudHJhbGl0eSA9IGNlbnRyYWxpdHlfZGVncmVlKG1vZGUgPSAiYWxsIikpIAoKZ2dyYXBoKGdyYXBoXzEsIGxheW91dCA9ICdmcicsIHN0YXJ0LnRlbXA9MTAwKSArCiAgZ2VvbV9lZGdlX2xpbmsoYWVzKGFscGhhPWxpZnQpLCAKICAgICAgICAgICAgICAgICBjb2xvdXIgPSAncmVkJywKICAgICAgICAgICAgICAgICBhcnJvdyA9IGFycm93KGxlbmd0aCA9IHVuaXQoNCwgJ21tJykpKSArIAogIGdlb21fbm9kZV9wb2ludChhZXMoc2l6ZSA9IGNlbnRyYWxpdHksIGNvbG91ciA9IGNlbnRyYWxpdHkpKSArIAogIGdlb21fbm9kZV90ZXh0KGFlcyhsYWJlbCA9IG5hbWUpLCBzaXplPTQsCiAgICAgICAgICAgICAgICAgY29sb3VyID0gJ2dyYXkyMCcsIHJlcGVsPVRSVUUpICsKICB0aGVtZV9ncmFwaCgpCmBgYAoKRXhwb3J0YW1vcyBwYXJhIGV4YW1pbmFyIGVuIEdlcGhpOgoKCmBgYHtyfQp3cml0ZV9jc3YoZGZfcmVnbGFzICU+JSByZW5hbWUoc291cmNlPWZyb20sIHRhcmdldD10bykgJT4lCiAgICAgICAgICAgIHNlbGVjdCgtY291bnQpLCAKICAgICAgICAgIHBhdGg9J3JlZ2xhcy5jc3YnKQpgYGAK